# install.packages("tidyverse")
# install.packages("timetk")
library(tidyverse)
library(readr) #read_csv()
library(dplyr)
library(glmnet)
library(caret) # may not need anymore
library(timetk)
# Load data:
# Expects the Kaggle Walmart sales dataset: one row per store per week.
df <- read_csv("./data/walmart/Walmart.csv")
head(df)  # quick visual check of the first rows
Fix datatypes and sort values.
# Fixing datatypes:
# Date parses as character because the raw file mixes "/" and "-" separators
# (normalized in the next step).
# any(grepl("/", df$Date))  # sanity check: do any dates use slashes?
print(sapply(df, class))  # inspect the class of every column
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI
"numeric" "character" "numeric" "numeric" "numeric" "numeric" "numeric"
Unemployment
"numeric"
# Normalize the separator: translate every "/" to "-" so all dates share one format.
df$Date <- chartr("/", "-", df$Date)
# Parse as day-month-year (NOTE: European-style ordering).
df$Date <- as.Date(df$Date, format = "%d-%m-%Y")
print(sapply(df, class))  # Date column should now be class "Date"
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI
"numeric" "Date" "numeric" "numeric" "numeric" "numeric" "numeric"
Unemployment
"numeric"
# Sort values:
# Order rows chronologically (ascending by week) before any time-based split.
df <- arrange(df, Date)
head(df)
print(dim(df))
[1] 6435 8
table(df$Date) # value_counts
2010-02-05 2010-02-12 2010-02-19 2010-02-26 2010-03-05 2010-03-12 2010-03-19 2010-03-26
45 45 45 45 45 45 45 45
2010-04-02 2010-04-09 2010-04-16 2010-04-23 2010-04-30 2010-05-07 2010-05-14 2010-05-21
45 45 45 45 45 45 45 45
2010-05-28 2010-06-04 2010-06-11 2010-06-18 2010-06-25 2010-07-02 2010-07-09 2010-07-16
45 45 45 45 45 45 45 45
2010-07-23 2010-07-30 2010-08-06 2010-08-13 2010-08-20 2010-08-27 2010-09-03 2010-09-10
45 45 45 45 45 45 45 45
2010-09-17 2010-09-24 2010-10-01 2010-10-08 2010-10-15 2010-10-22 2010-10-29 2010-11-05
45 45 45 45 45 45 45 45
2010-11-12 2010-11-19 2010-11-26 2010-12-03 2010-12-10 2010-12-17 2010-12-24 2010-12-31
45 45 45 45 45 45 45 45
2011-01-07 2011-01-14 2011-01-21 2011-01-28 2011-02-04 2011-02-11 2011-02-18 2011-02-25
45 45 45 45 45 45 45 45
2011-03-04 2011-03-11 2011-03-18 2011-03-25 2011-04-01 2011-04-08 2011-04-15 2011-04-22
45 45 45 45 45 45 45 45
2011-04-29 2011-05-06 2011-05-13 2011-05-20 2011-05-27 2011-06-03 2011-06-10 2011-06-17
45 45 45 45 45 45 45 45
2011-06-24 2011-07-01 2011-07-08 2011-07-15 2011-07-22 2011-07-29 2011-08-05 2011-08-12
45 45 45 45 45 45 45 45
2011-08-19 2011-08-26 2011-09-02 2011-09-09 2011-09-16 2011-09-23 2011-09-30 2011-10-07
45 45 45 45 45 45 45 45
2011-10-14 2011-10-21 2011-10-28 2011-11-04 2011-11-11 2011-11-18 2011-11-25 2011-12-02
45 45 45 45 45 45 45 45
2011-12-09 2011-12-16 2011-12-23 2011-12-30 2012-01-06 2012-01-13 2012-01-20 2012-01-27
45 45 45 45 45 45 45 45
2012-02-03 2012-02-10 2012-02-17 2012-02-24 2012-03-02 2012-03-09 2012-03-16 2012-03-23
45 45 45 45 45 45 45 45
2012-03-30 2012-04-06 2012-04-13 2012-04-20 2012-04-27 2012-05-04 2012-05-11 2012-05-18
45 45 45 45 45 45 45 45
2012-05-25 2012-06-01 2012-06-08 2012-06-15 2012-06-22 2012-06-29 2012-07-06 2012-07-13
45 45 45 45 45 45 45 45
2012-07-20 2012-07-27 2012-08-03 2012-08-10 2012-08-17 2012-08-24 2012-08-31 2012-09-07
45 45 45 45 45 45 45 45
2012-09-14 2012-09-21 2012-09-28 2012-10-05 2012-10-12 2012-10-19 2012-10-26
45 45 45 45 45 45 45
There are 45 stores, so it makes sense that there are 45 entries for each date. Each date represents one week.
Normalize weekly sales within each store so that no one store dominates and weekly sales are comparable across stores: scaled_sales = (sales_store - mean(sales_store)) / sd(sales_store). We can convert back later via (scaled_sales * store_sd) + store_mean.
# Per-store z-scoring of Weekly_Sales. The per-store mean/sd are kept as
# columns so the scaling is invertible later:
#   Weekly_Sales = weekly_sales_scaled * sales_std + sales_mean
df <- df %>%
  group_by(Store) %>%
  mutate(sales_mean = mean(Weekly_Sales, na.rm = TRUE)) %>%
  mutate(sales_std = sd(Weekly_Sales, na.rm = TRUE)) %>%
  mutate(weekly_sales_scaled = (Weekly_Sales - sales_mean) / sales_std) %>%
  ungroup()
head(df)
# One line plot of scaled weekly sales per store.
# Fix: iterate over the actual store IDs rather than 1:length(unique(df$Store)),
# which silently assumes stores are numbered 1..n with no gaps.
for (store_id in sort(unique(df$Store))) {
  p <- df %>%
    filter(Store == store_id) %>%
    ggplot(aes(x = Date, y = weekly_sales_scaled)) +
    geom_line(color = "steelblue") +
    labs(title = paste("Weekly Sales Over Time - Store", store_id),
         x = "Date",
         y = "Weekly Sales (scaled per store)")
  print(p)  # explicit print needed inside a loop for ggplot objects
}
NA
NA
# Can't randomize the data first, because it is time-series.
# Fix: split on a week boundary instead of a raw 80% row cut. 6435 rows is
# not a multiple of 45 stores at 80%, so the old row cut put some stores'
# 2012-04-13 week in train and others' in test (the printed ranges showed
# that date in BOTH sets) -- leaking one week across the split.
all_dates <- sort(unique(df$Date))
split_date <- all_dates[floor(0.8 * length(all_dates))]
train_df <- df[df$Date <= split_date, ]
test_df <- df[df$Date > split_date, ]
print(dim(train_df))
[1] 5148 11
print(dim(test_df))
[1] 1287 11
cat("train:", format(min(train_df$Date), "%Y-%m-%d"), " to ", format(max(train_df$Date), "%Y-%m-%d"),"\n")
train: 2010-02-05 to 2012-04-13
cat("test:", format(min(test_df$Date), "%Y-%m-%d"), " to ", format(max(test_df$Date), "%Y-%m-%d"),"\n")
test: 2012-04-13 to 2012-10-26
print(colnames(df))
[1] "Store" "Date" "Weekly_Sales"
[4] "Holiday_Flag" "Temperature" "Fuel_Price"
[7] "CPI" "Unemployment" "sales_mean"
[10] "sales_std" "weekly_sales_scaled"
# model.matrix creates a design matrix, auto-encodes categorical as dummy vars (if any--in this case, holiday is already a double.)
# NOTE(review): model.matrix silently drops rows containing NA, which would
# misalign X_test with y_test below -- confirm the data has no NAs first.
# X_train <- model.matrix(weekly_sales_scaled ~ . - Store - Date - Weekly_Sales - sales_mean - sales_std + 0, data=train_df) # + 0 means no intercept
X_test <- model.matrix(weekly_sales_scaled ~ . - Store - Date - Weekly_Sales - sales_mean - sales_std + 0, data=test_df)
# y_train <- train_df$weekly_sales_scaled
y_test <- test_df$weekly_sales_scaled
# NOTE: i just commented out X_train because realized cv function requires dataframe.
print(dim(X_test))  # rows x predictors (5 predictors after the exclusions)
[1] 1287 5
#print(dim(X_train))
#print(X_train[1:10,])
# Peek at the first ten rows of the test design matrix.
print(X_test[1:10,])
Holiday_Flag Temperature Fuel_Price CPI Unemployment
1 0 44.42 4.187 137.8680 8.150
2 0 45.68 4.044 214.3127 7.139
3 0 69.03 3.891 221.1484 6.891
4 0 49.89 4.025 141.8434 7.671
5 0 41.81 4.025 137.8680 4.125
6 0 48.65 4.187 137.8680 8.983
7 0 42.46 4.044 214.3127 7.139
8 0 36.90 4.025 137.8680 7.489
9 0 52.22 4.187 141.8434 8.253
10 0 64.28 4.254 131.1080 11.627
It is important to keep the future in the test set; otherwise the model could use information from the future to predict the past, accidentally learning patterns it would never have access to at prediction time (data leakage).
# Rolling-origin CV folds via timetk (train_df must already be date-sorted).
cv_folds <- time_series_cv(
data = train_df,
date_var = Date,
cumulative = FALSE, # no growing window (same size folds)
initial = "6 months", # length of train window
assess = "3 months", # length of validation window
skip = "3 months", # jump between folds
slice_limit = 5 # num folds max
)
# Visualize how each fold's train/validation windows tile the timeline.
plot_time_series_cv_plan(cv_folds, .date_var = Date, .value = weekly_sales_scaled)
Set up cross-validation folds for the time series.
# Rolling-window CV fold indices over the training rows (caret).
n_train <- nrow(train_df)
folds <- createTimeSlices(seq_len(n_train),
                          initialWindow = floor(0.7 * n_train), # 70% is train data
                          horizon = floor(0.1 * n_train), # fix: `n` was undefined; 10% is val data
                          fixedWindow = TRUE) # do not grow the time slice windows with time--do rolling.
str(folds$train[1:3]) # inspect the first three folds' training index vectors